Four Maps of Harrisonburg¶
In [ ]:
import ipyparallel as ipp
cluster = ipp.Cluster.from_file("C:\Users\joost\.ipython\profile_default\security\cluster-.json")
rc = cluster.connect_client_sync()
rc
The following code creates four different maps of Harrisonburg.
In [45]:
import os
import zipfile
import pandas as pd
import geopandas as gpd
In [46]:
# Set directory path
data_dir = 'data'
# Check for .zip files in the directory
zip_files = [f for f in os.listdir(data_dir) if f.endswith('.zip')]
# Unzip all found .zip files
for zip_file in zip_files:
with zipfile.ZipFile(os.path.join(data_dir, zip_file), 'r') as zip_ref:
zip_ref.extractall(data_dir)
# Now, look for .kml files in the directory after unzipping
kml_files = [f for f in os.listdir(data_dir) if f.endswith('.kml')]
In [47]:
# Initialize an empty list to store data
data = []
# Iterate over .kml files, reading each into GeoDataFrame and extracting necessary information
for kml_file in kml_files:
kml_path = os.path.join(data_dir, kml_file)
try:
# Read the KML file with geopandas
gdf = gpd.read_file(kml_path)
# Append each row of the GeoDataFrame with the file name to our data list
for _, row in gdf.iterrows():
data.append({'file_name': kml_file, 'geometry': row['geometry']})
except Exception as e:
print(f"Could not read {kml_file}: {e}")
# Convert the data list into a DataFrame
df_kml = pd.DataFrame(data)
In [48]:
import re
df_kml['file_id'] = df_kml['file_name'].apply(lambda x: re.search(r'_(\d+)_', x).group(1) if re.search(r'_(\d+)_', x) else None)
df_kml = df_kml.drop(columns=['file_name'])
In [49]:
names_df = pd.read_csv("data/names.csv")
names_df['ID'] = names_df['ID'].astype(str)
# Map the 'Name' column in df_kml based on matching 'file_id' with 'ID' in names_df
df_kml = df_kml.merge(names_df, left_on='file_id', right_on='ID', how='left').drop(columns=['ID'])
# Rename 'Student' to 'Name' in df_kml
df_kml = df_kml.rename(columns={'Student': 'Name'})
In [50]:
df_majors = pd.read_csv("data/names_majors.csv")
In [51]:
df_majors['Program and Plan'] = df_majors['Program and Plan'].apply(
lambda x: re.search(r' -\s*(.*?)\s*-', x).group(1) if isinstance(x, str) and re.search(r' -\s*(.*?)\s*-', x) else None
)
df_majors['Name'] = df_majors['Name'].str.replace(',', ', ', regex=False)
In [52]:
df_majors = df_majors.rename(columns={'Program and Plan': 'Program'})
In [53]:
# Convert the 'ID' in df_majors and 'file_id' in df_kml to strings if they're not already
df_majors['ID'] = df_majors['ID'].astype(str)
df_kml['file_id'] = df_kml['file_id'].astype(str)
# Merge df_kml with df_majors on the ID columns
df_merged = df_kml.merge(df_majors, left_on='Name', right_on='Name', how='left')
# Drop the redundant 'ID' column if you only want to keep 'file_id'
df_merged = df_merged.drop(columns=['ID'])
In [54]:
import pandas as pd
import plotly.express as px
import geopandas as gpd
# Calculate longitude and latitude for each geometry
df_merged['lon'] = df_merged['geometry'].apply(lambda geom: geom.centroid.x if geom else None)
df_merged['lat'] = df_merged['geometry'].apply(lambda geom: geom.centroid.y if geom else None)
Grouped by name¶
In [56]:
# Create a Mapbox scatter plot with color coding by Name
fig = px.scatter_mapbox(
df_merged,
lat="lat",
lon="lon",
color="Name",
title="Map of Geometries by KML File",
hover_name="Name",
mapbox_style="carto-positron", # Black and white style
zoom=10 # Adjust the zoom level as needed
)
# Update layout to center on Harrisonburg, VA (coordinates: 38.4496, -78.8689)
fig.update_layout(
mapbox=dict(
center=dict(lat=38.4496, lon=-78.8689),
zoom=10 # Higher zoom level for more focus on the center
),
margin={"r":0, "t":0, "l":0, "b":0}
)
# Show the map
fig.show()
Grouped by Program¶
In [58]:
# Create a Mapbox scatter plot with color coding by Name
fig = px.scatter_mapbox(
df_merged,
lat="lat",
lon="lon",
color="Program",
title="Map of Geometries by KML File",
hover_name="Name",
mapbox_style="carto-positron", # Black and white style
zoom=10 # Adjust the zoom level as needed
)
# Update layout to center on Harrisonburg, VA (coordinates: 38.4496, -78.8689)
fig.update_layout(
mapbox=dict(
center=dict(lat=38.4496, lon=-78.8689),
zoom=10 # Higher zoom level for more focus on the center
),
margin={"r":0, "t":0, "l":0, "b":0}
)
# Show the map
fig.show()
Group by level¶
In [60]:
# Create a Mapbox scatter plot with color coding by Name
fig = px.scatter_mapbox(
df_merged,
lat="lat",
lon="lon",
color="Level",
title="Map of Geometries by KML File",
hover_name="Name",
mapbox_style="carto-positron", # Black and white style
zoom=10 # Adjust the zoom level as needed
)
# Update layout to center on Harrisonburg, VA (coordinates: 38.4496, -78.8689)
fig.update_layout(
mapbox=dict(
center=dict(lat=38.4496, lon=-78.8689),
zoom=10 # Higher zoom level for more focus on the center
),
margin={"r":0, "t":0, "l":0, "b":0}
)
# Show the map
fig.show()
In [61]:
import numpy as np
from scipy.spatial.distance import pdist
# Ensure lat and lon columns exist by calculating the centroids in df_merged if they don't exist
if 'lat' not in df_merged.columns or 'lon' not in df_merged.columns:
df_merged['lon'] = df_merged['geometry'].apply(lambda geom: geom.centroid.x if geom else None)
df_merged['lat'] = df_merged['geometry'].apply(lambda geom: geom.centroid.y if geom else None)
# Step 1: Calculate the average distance within each student's points
def average_distance_within_group(group):
if len(group) < 2: # If only one point, average distance is zero
return 0
coords = group[['lat', 'lon']].values
return np.mean(pdist(coords)) # Calculate pairwise distances and take the mean
# Calculate avg_distance for each student and store as a DataFrame
student_avg_dist = df_merged.groupby('Name', as_index=False, group_keys=False).apply(
lambda group: pd.Series({
'avg_distance': average_distance_within_group(group)
})
)
C:\Users\joost\AppData\Local\Temp\ipykernel_81680\3016280905.py:18: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
In [63]:
# Ensure 'avg_distance' is numeric
student_avg_dist['avg_distance'] = pd.to_numeric(student_avg_dist['avg_distance'], errors='coerce').fillna(0).astype(float)
# Step 2: Sort by avg_distance and divide students into 4 equal groups
student_avg_dist['group'] = pd.qcut(student_avg_dist['avg_distance'], 4, labels=False)+1
student_avg_dist['group'] = student_avg_dist['group'].astype(str)
In [65]:
# Step 3: Merge the 'group' information back into df_merged for visualization
df_merged['Name'] = df_merged['Name'].astype(str).str.strip()
student_avg_dist['Name'] = student_avg_dist['Name'].astype(str).str.strip()
df_merged = df_merged.merge(student_avg_dist[['Name', 'group']], on='Name', how='left')
In [67]:
# Create a new DataFrame with group names and unique student lists
df_merged['FirstName'] = df_merged['Name'].apply(
lambda x: f"{x.split(', ')[1]} {x.split(', ')[0][0]}." if ', ' in x else x
)
In [69]:
grouped_names = (
df_merged.groupby('group')['FirstName']
.unique()
.apply(lambda names: ', '.join(names))
.reset_index()
)
# Create the "Group" label with group number and names
grouped_names['Group'] = grouped_names['group'].apply(lambda x: f"Group {x}")
grouped_names['Group'] = grouped_names['Group'] + ': ' + grouped_names['FirstName']
# Merge the new 'Group' label back into df_merged
df_merged = df_merged.merge(grouped_names[['group', 'Group']], on='group', how='left')
Clustered by distance from center¶
In [99]:
# Plot the map with custom hover text for each group
fig = px.scatter_mapbox(
df_merged,
lat="lat",
lon="lon",
color="Group", # Color by group
title="Student Groups Based on Clustering of Locations",
hover_name="Name",
mapbox_style="carto-positron", # Black and white style
zoom=10
)
# Set custom hover template for all points to show group name lists
fig.update_traces(hovertemplate="<b>%{custom_hover}</b><extra></extra>")
# Center the map on Harrisonburg, VA (38.4496, -78.8689)
fig.update_layout(
mapbox=dict(
center=dict(lat=38.4496, lon=-78.8689),
zoom=10
),
margin={"r":0, "t":0, "l":0, "b":0}
)
# Show the map
fig.show()
In [ ]: